Portland Metropolitian Area, Oregon
Metro serves three counties in the Portland, Oregon region; Washington County, Multnomah County and Clackamas County
Total population in total 2.51 Million
tm_shape(s1) +
tm_polygons(
col = c('Whi', 'lat'),
lwd = 0.2,
border.col = 'grey80',
title = c("Spatial Distribution of White only Population ", "Spatial Distribution of Hispanic or Latino Population"),
palette = 'RdBu',
n=5,
alpha = 0.6,
midpoint = 0,
contrast =c(0,1),
style = 'quantile') +
tm_tiles("Stamen.TonerLabels") +
tm_layout(main.title = 'Portland Metro Area: Population Distribution')
White only and Hispanic Latino population have the highest percentage of the population.
tm_shape(s1) +
tm_polygons(
col = c('Asi', 'Mix', 'Bla', 'Hawi'),
lwd = 0.2,
border.col = 'grey80',
title =c("Spatial Distribution of Asian Population", "Spatial Distribution of Two or More Races Population", "Spatial Distribution of Black Population", "Spatial Distribution of Native Hawaiian Population"),
palette = 'GnBu',
n=5,
alpha = 0.6,
midpoint = 0,
contrast =c(0,1),
style = 'quantile') +
tm_tiles("Stamen.TonerLabels") +
tm_layout(main.title = 'Portland Metro Area: Asian')
Comparing Private Vehicle to Public Transportation

Public transport vs Private transport
t1_nb <- poly2nb(t1,queen=T)
t1_W <- nb2listw(t1_nb,
style="W",
zero.policy = FALSE)
t11_local_g <- spdep::localG(
x = t1$PrT,
listw = t1_W,
zero.policy = TRUE) %>%
as.vector()
# I made us another function!
make_g_bin <- function(v){
case_when(
v > -1.96 & v < 1.96 ~ "Insignificant",
v >= 1.96 & v < 2.58 ~ "Hotspot (>1.96)",
v >= 2.58 ~ "Hotspot (>2.58)",
v < -1.96 & v > -2.58 ~ 'Coldspot (<-1.96)',
v < -2.58 ~ 'Coldspot (<-2.58)',
TRUE ~ 'Unknown'
) %>% as.factor() %>%
ordered(c(
"Unknown",
"Coldspot (<-2.58)",
"Coldspot (<-1.96)",
"Insignificant",
"Hotspot (>1.96)",
"Hotspot (>2.58)"))
}
g_palette <- c('grey','#0571B0','#92C5DE','cornsilk','#F4A582','#CA0020')
# make_g_bin(v = t11_local_g)
suppressMessages(tmap_mode("view"))
tmap_options(basemaps = c("Stamen.TonerLite","Stamen.Terrain","Esri.WorldTopoMap","Esri.WorldImagery"))
t1 %>% mutate(G=make_g_bin(t11_local_g)) %>%
tm_shape() +
tm_polygons(
col = "G",
palette = g_palette,
border.col = 'grey80',
alpha = 0.5,
lwd=0.4,
title = "Local Getis-Ord G",
popup.vars=c("Tract: " = "NAME", "Percent of population Use Car/Truck/Van to Work (x 100)" = "PrT", "Classification: " = "G"))+
tm_shape(st_union(t1)) +
tm_polygons(zindex = 200,lwd=2,alpha = 0,border.col = "black")+
tm_tiles(leaflet::providers$Stamen.TerrainLabels) +
tm_layout(main.title = "Prublic Transport to Work")-Much of eastern Oregon and western Oregon appear to be Hotspots for the use of private vehicles such as cars, trucks and vans.
t12_local_g <- spdep::localG(
x = t1$PuT,
listw = t1_W,
zero.policy = TRUE) %>%
as.vector()
# I made us another function!
make_g_bin <- function(v){
case_when(
v > -1.96 & v < 1.96 ~ "Insignificant",
v >= 1.96 & v < 2.58 ~ "Hotspot (>1.96)",
v >= 2.58 ~ "Hotspot (>2.58)",
v < -1.96 & v > -2.58 ~ 'Coldspot (<-1.96)',
v < -2.58 ~ 'Coldspot (<-2.58)',
TRUE ~ 'Unknown'
) %>% as.factor() %>%
ordered(c(
"Unknown",
"Coldspot (<-2.58)",
"Coldspot (<-1.96)",
"Insignificant",
"Hotspot (>1.96)",
"Hotspot (>2.58)"))
}
g_palette <- c('grey','#0571B0','#92C5DE','cornsilk','#F4A582','#CA0020')
# make_g_bin(v = t12_local_g)
suppressMessages(tmap_mode("view"))
tmap_options(basemaps = c("Stamen.TonerLite","Stamen.Terrain","Esri.WorldTopoMap","Esri.WorldImagery"))
t1 %>% mutate(G=make_g_bin(t12_local_g)) %>%
tm_shape() +
tm_polygons(
col = "G",
palette = g_palette,
border.col = 'grey80',
alpha = 0.5,
lwd=0.4,
title = "Local Getis-Ord G",
popup.vars=c("Tract: " = "NAME", "Percent of population Use Public Transportation to Work (x 100)" = "PuT", "Classification: " = "G"))+
tm_shape(st_union(t1)) +
tm_polygons(zindex = 200,lwd=2,alpha = 0,border.col = "black")+
tm_tiles(leaflet::providers$Stamen.TerrainLabels) +
tm_layout(main.title = "Public Transportation to Work")-And our Portland seems to be a hotspot for public transit use. We can say Portland is a public transportation friendly area.
moran_plot( #giving values to the function
var = t1$PuT,
w_var = t1$lag_PuT,
xlab = "Public Transport to Wrok(%)",
ylab = "Lag of Public Transport to Work(%)")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

And we see a positive relationship between people who use public transport and their neighbors (here’s “lag”). This means that the neighbors of those who take public transport to work also use public transportation to work.
We need to consider some factors, demographics and locations, to see if they affect the use of public transport.
Let’s see first, where to go?
This included,
bus_stops <- pull_osm_points(tag = "highway",
feature = "bus_stop")#Bus stops locations
tram_stops <- pull_osm_points(tag = "railway",
feature = "tram_stop")#light rail locations
stops <- rbind(bus_stops, tram_stops)
colnames(stops) <- c("osm_id_stops", "stops_name", "geometry")
stops %>% qtm()ooh!! Too Much stops In Portland!
We also need to look at some other locations data that may affect the use of public transportation
Now, see the relationship between the variables individually with the Public Transportation Use
Correlation between the variables and those who use public transport to work
#t3 %>% ggplot(aes(x=mhi, y=put)) +
t3 %>% ggplot(aes(x=mhi, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Median Household Income") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
oh! no… Negative !
#merged_data %>% ggplot(aes(x=edu, y=put)) +
t3 %>% ggplot(aes(x=edu, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Education") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Slightly Positive !
t3 %>% ggplot(aes(x=hhs, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Household Size") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Negative again!
#merged_data %>% ggplot(aes(x=ttw, y=put)) +
t3 %>% ggplot(aes(x=ttw, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Travel time to Work") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Positive !
#merged_data %>% ggplot(aes(x=rac, y=put)) +
t3 %>% ggplot(aes(x=rac, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Race of the Population") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Negative !
#merged_data %>% ggplot(aes(x=prt, y=put)) +
t3 %>% ggplot(aes(x=prt, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Private Vehicle to Work") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Strong Negative correlation!
#merged_data %>% ggplot(aes(x=put, y=put)) +
t3 %>% ggplot(aes(x=count_of_stops, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Private Vehicle to Work") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Finally a proper Positive correlation
Isn’t it obvious?
#merged_data %>% ggplot(aes(x=nov, y=put)) +
#t3 %>% ggplot(aes(x=nov, y=count_of_stops)) +
t3 %>% ggplot(aes(x=nov, y=put)) +
geom_point(color='royalblue', size=1) +
geom_smooth(
formula = 'y ~ x',
method = 'lm',
color="red",
size=0.8,
se=F) +
xlab("Tenure of vehicle_No vehicle") +
ylab("Public Transportation to Work") +
ggtitle("Correlation of Variables")
Its obvious too!!
APositive correlation
Let’s see what happens when all the variables work together
library(tidyverse)
formula <- formula(put ~ mhi + edu + hhs + ttw + rac + count_of_stops + prt + nov + count_of_parks + count_of_rests + count_of_parkings)
mod <- lm(formula, data = t3)##
## Call:
## lm(formula = formula, data = t3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.244994 -0.031652 -0.004356 0.026034 0.288840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.275e-01 1.344e-02 24.365 < 2e-16 ***
## mhi -4.685e-07 6.330e-08 -7.402 2.54e-13 ***
## edu -2.740e-02 1.294e-02 -2.118 0.03441 *
## hhs 5.518e-03 3.778e-03 1.461 0.14436
## ttw 1.056e-04 2.380e-05 4.436 1.00e-05 ***
## rac -9.178e-06 3.189e-06 -2.878 0.00408 **
## count_of_stops 1.328e-03 3.366e-04 3.945 8.47e-05 ***
## prt -2.942e-01 1.201e-02 -24.496 < 2e-16 ***
## nov -3.808e-05 8.950e-05 -0.426 0.67051
## count_of_parks -2.973e-03 3.012e-03 -0.987 0.32389
## count_of_rests 1.351e-03 7.244e-04 1.864 0.06252 .
## count_of_parkings -2.447e-03 1.893e-03 -1.292 0.19647
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05422 on 1177 degrees of freedom
## Multiple R-squared: 0.5008, Adjusted R-squared: 0.4962
## F-statistic: 107.4 on 11 and 1177 DF, p-value: < 2.2e-16
-So,
The median household income, education level, travel time to work, race of the population, stops location and private vehicle to work variables are statistically significant with the public transportation usage (p<0.05)
##
## Call:
## lm(formula = formula, data = t3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.244994 -0.031652 -0.004356 0.026034 0.288840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.275e-01 1.344e-02 24.365 < 2e-16 ***
## mhi -4.685e-07 6.330e-08 -7.402 2.54e-13 ***
## edu -2.740e-02 1.294e-02 -2.118 0.03441 *
## hhs 5.518e-03 3.778e-03 1.461 0.14436
## ttw 1.056e-04 2.380e-05 4.436 1.00e-05 ***
## rac -9.178e-06 3.189e-06 -2.878 0.00408 **
## count_of_stops 1.328e-03 3.366e-04 3.945 8.47e-05 ***
## prt -2.942e-01 1.201e-02 -24.496 < 2e-16 ***
## nov -3.808e-05 8.950e-05 -0.426 0.67051
## count_of_parks -2.973e-03 3.012e-03 -0.987 0.32389
## count_of_rests 1.351e-03 7.244e-04 1.864 0.06252 .
## count_of_parkings -2.447e-03 1.893e-03 -1.292 0.19647
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05422 on 1177 degrees of freedom
## Multiple R-squared: 0.5008, Adjusted R-squared: 0.4962
## F-statistic: 107.4 on 11 and 1177 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = formula, data = t3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.244994 -0.031652 -0.004356 0.026034 0.288840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.275e-01 1.344e-02 24.365 < 2e-16 ***
## mhi -4.685e-07 6.330e-08 -7.402 2.54e-13 ***
## edu -2.740e-02 1.294e-02 -2.118 0.03441 *
## hhs 5.518e-03 3.778e-03 1.461 0.14436
## ttw 1.056e-04 2.380e-05 4.436 1.00e-05 ***
## rac -9.178e-06 3.189e-06 -2.878 0.00408 **
## count_of_stops 1.328e-03 3.366e-04 3.945 8.47e-05 ***
## prt -2.942e-01 1.201e-02 -24.496 < 2e-16 ***
## nov -3.808e-05 8.950e-05 -0.426 0.67051
## count_of_parks -2.973e-03 3.012e-03 -0.987 0.32389
## count_of_rests 1.351e-03 7.244e-04 1.864 0.06252 .
## count_of_parkings -2.447e-03 1.893e-03 -1.292 0.19647
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05422 on 1177 degrees of freedom
## Multiple R-squared: 0.5008, Adjusted R-squared: 0.4962
## F-statistic: 107.4 on 11 and 1177 DF, p-value: < 2.2e-16
Make sense, right?
## mhi edu hhs ttw
## 2.310387 2.617762 1.841092 1.368497
## rac count_of_stops prt nov
## 1.563823 1.468361 1.531758 1.043644
## count_of_parks count_of_rests count_of_parkings
## 1.233374 1.407742 1.175481

There are some large outlier which has made a trend with our residual.
data.frame(
residuals = mod$residuals^2 %>% sqrt(),
fitted = mod$fitted.values
) %>% ggplot(aes(x=fitted,y=residuals)) +
geom_point() +
xlab("Fitted Values") +
ylab("Standardized Residuals") +
geom_smooth(
method = 'gam',
col='red',se = FALSE) +
ggtitle("Residuals vs. Fitted Values")## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

There are more extreme points with the higher residuals rather than the lower ones.
suppressPackageStartupMessages(require(spdep))
suppressPackageStartupMessages(require(spatialreg)) # This is new to me!
t3_nb <- poly2nb(pl = t3, queen = TRUE)
t3_W <- nb2listw(t3_nb,style="W")
LM <- lm.LMtests(mod, t3_W, test = "all")
summary(LM)## Lagrange multiplier diagnostics for spatial dependence
## data:
## model: lm(formula = formula, data = t3)
## weights: t3_W
##
## statistic parameter p.value
## LMerr 24.8833 1 6.091e-07 ***
## LMlag 30.5669 1 3.226e-08 ***
## RLMerr 1.3429 1 0.246524
## RLMlag 7.0264 1 0.008031 **
## SARMA 31.9098 2 1.177e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So, both the LMerr and LMlag models are significant.
##
## Call:spatialreg::lagsarlm(formula = put ~ mhi + edu + hhs + ttw +
## rac + count_of_stops + prt + nov + count_of_parks + count_of_rests +
## count_of_parkings, data = t3, listw = t3_W, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2609765 -0.0311176 -0.0044053 0.0246747 0.2934326
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.9047e-01 1.5361e-02 18.9099 < 2.2e-16
## mhi -3.9485e-07 6.2889e-08 -6.2785 3.418e-10
## edu -3.3597e-02 1.2682e-02 -2.6491 0.008070
## hhs 5.1656e-03 3.7021e-03 1.3953 0.162921
## ttw 9.3407e-05 2.3379e-05 3.9954 6.460e-05
## rac -7.1310e-06 3.1526e-06 -2.2619 0.023704
## count_of_stops 1.0512e-03 3.3154e-04 3.1707 0.001521
## prt -2.6712e-01 1.3022e-02 -20.5124 < 2.2e-16
## nov -6.7101e-05 8.7775e-05 -0.7645 0.444594
## count_of_parks -4.6063e-03 2.9557e-03 -1.5585 0.119123
## count_of_rests 8.8919e-04 7.1396e-04 1.2454 0.212973
## count_of_parkings -2.2335e-03 1.8550e-03 -1.2040 0.228572
##
## Rho: 0.19523, LR test value: 28.846, p-value: 7.8381e-08
## Asymptotic standard error: 0.03562
## z-value: 5.4809, p-value: 4.2322e-08
## Wald statistic: 30.04, p-value: 4.2322e-08
##
## Log likelihood: 1798.977 for lag model
## ML residual variance (sigma squared): 0.0028216, (sigma: 0.053119)
## Nagelkerke pseudo-R-squared: 0.5128
## Number of observations: 1189
## Number of parameters estimated: 14
## AIC: -3570, (AIC for lm: -3543.1)
## LM test for residual autocorrelation
## test value: 0.43328, p-value: 0.51038
and
##
## Call:spatialreg::errorsarlm(formula = put ~ mhi + edu + hhs + ttw +
## rac + count_of_stops + prt + nov + count_of_parks + count_of_rests +
## count_of_parkings, data = t3, listw = t3_W, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2541163 -0.0316929 -0.0032053 0.0260888 0.2812569
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.3593e-01 1.4092e-02 23.8389 < 2.2e-16
## mhi -4.2198e-07 6.4180e-08 -6.5750 4.867e-11
## edu -3.6575e-02 1.3455e-02 -2.7183 0.006561
## hhs 3.2501e-03 3.7965e-03 0.8561 0.391961
## ttw 9.6584e-05 2.3308e-05 4.1438 3.415e-05
## rac -9.2742e-06 3.1487e-06 -2.9454 0.003226
## count_of_stops 1.0727e-03 3.5077e-04 3.0580 0.002228
## prt -2.9455e-01 1.2580e-02 -23.4152 < 2.2e-16
## nov -6.1306e-05 8.7667e-05 -0.6993 0.484358
## count_of_parks -4.7166e-03 2.9793e-03 -1.5831 0.113394
## count_of_rests 1.1322e-03 7.4397e-04 1.5218 0.128050
## count_of_parkings -2.0886e-03 1.8901e-03 -1.1050 0.269148
##
## Lambda: 0.23582, LR test value: 24.327, p-value: 8.1275e-07
## Asymptotic standard error: 0.044587
## z-value: 5.2889, p-value: 1.2304e-07
## Wald statistic: 27.973, p-value: 1.2304e-07
##
## Log likelihood: 1796.718 for error model
## ML residual variance (sigma squared): 0.0028235, (sigma: 0.053137)
## Nagelkerke pseudo-R-squared: 0.51095
## Number of observations: 1189
## Number of parameters estimated: 14
## AIC: -3565.4, (AIC for lm: -3543.1)
## [1] 0.05313707
## df AIC
## mod_lag 14 -3569.954
## mod_err 14 -3565.435
Though the difference is very little but it is telling that Spatial Lag Model would have better performance